from __future__ import print_function
import os.path
import dalmatian as dm
import pandas as pd
import sys
sys.path.insert(0, '../../')
#import Datanalytics as da
from JKBio import TerraFunction as terra
%load_ext autoreload
%autoreload 2
from JKBio import Helper as h
import pickle
from taigapy import TaigaClient
tc = TaigaClient()
import numpy as np
import itertools
from bokeh.plotting import *
from bokeh.models import HoverTool
output_notebook()
import matplotlib.pyplot as plt
%load_ext rpy2.ipython
import seaborn as sns
import gseapy
import matplotlib.pyplot as plt
import networkx as nx
from JKBio.helper import pyDESeq2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
! gsutil mv gs://transfer-amlproject/*MP7624* gs://transfer-amlproject/RNPv2/
! gsutil -m cp -r gs://transfer-amlproject/RNPv3 gs://amlproject/RNA/
! gsutil ls gs://amlproject/
sampleset='RNPv3'
terra.uploadFromFolder('amlproject','RNPv2/',
'broad-firecloud-ccle/hg38_RNAseq',samplesetname=sampleset,
fformat="fastqR1R2", sep='_MP7624')
wm = dm.WorkspaceManager('broad-firecloud-ccle/hg38_RNAseq')
submission_id = wm.create_submission("star_v1-0_BETA_cfg", sampleset, 'sample_set',expression='this.samples')
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
submission_id = wm.create_submission("rsem_v1-0_BETA_cfg",
sampleset,'sample_set',expression='this.samples')
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
submission_id = wm.create_submission("rsem_aggregate_results_v1-0_BETA_cfg",
sampleset)
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
results = wm.get_sample_sets().loc[sampleset]
rsem_genes_expected_count = results['rsem_genes_expected_count']
results
mkdir ../../data/RNPv3
! gsutil cp $rsem_genes_expected_count ../../data/RNPv3/
file = '../../data/RNPv3/'+rsem_genes_expected_count.split('/')[-1]
file
! gunzip $file
file
rsem_genes_expected_count = pd.read_csv(file[:-3], sep='\t')
rsem_genes_expected_count = pd.read_csv("../../data/RNPv3/RNPv3.rsem_genes_expected_count.txt", sep='\t')
data = rsem_genes_expected_count.drop("transcript_id(s)",1)
data["gene_id"] = h.convertGenes(data['gene_id'])[0]
data=data.set_index('gene_id')
data
rename = {"1": "mr120-MV411-RNP_IRF2BP2-r4",
"2": "mr121-MV411-RNP_IRF2BP2-r5",
"3": "mr122-MV411-RNP_IRF2BP2-r6",
"4": "mr123-MV411-RNP_IRF8-r4",
"5": "mr124-MV411-RNP_IRF8-r5",
"6": "mr125-MV411-RNP_IRF8-r6",
"7": "mr126-MV411-RNP_MEF2D-r4",
"8": "mr127-MV411-RNP_MEF2D-r5",
"9": "mr128-MV411-RNP_MEF2D-r6",
"10": "mr129-MV411-RNP_MYC-r4",
"11": "mr130-MV411-RNP_MYC-r5",
"12": "mr131-MV411-RNP_MYC-r6",
"13": "mr132-MV411-RNP_RUNX1-r4",
"14": "mr133-MV411-RNP_RUNX1-r5",
"15": "mr134-MV411-RNP_RUNX1-r6",
"16": "mr135-MV411-RNP_RUNX2-r4",
"17": "mr136-MV411-RNP_RUNX2-r5",
"18": "mr137-MV411-RNP_RUNX2-r6",
"19": "mr138-MV411-RNP_SPI1-r4",
"20": "mr139-MV411-RNP_SPI1-r5",
"21": "mr140-MV411-RNP_SPI1-r6",
"22": "mr141-MV411-RNP_ZMYND8-r4",
"23": "mr142-MV411-RNP_ZMYND8-r5",
"24": "mr143-MV411-RNP_ZMYND8-r6",
"25": "mr144-MV411-RNP_LMO2-r4",
"26": "mr145-MV411-RNP_LMO2-r5",
"27": "mr146-MV411-RNP_LMO2-r6",
"28": "mr147-MV411-RNP_LYL1-r4",
"29": "mr148-MV411-RNP_LYL1-r5",
"30": "mr149-MV411-RNP_LYL1-r6",
"31": "mr150-MV411-RNP_MAX-r4",
"32": "mr151-MV411-RNP_MAX-r5",
"33": "mr152-MV411-RNP_MAX-r6",
"34": "mr153-MV411-RNP_ZEB2-r4",
"35": "mr154-MV411-RNP_ZEB2-r5",
"36": "mr155-MV411-RNP_ZEB2-r6",
"37": "mr156-MV411-RNP_MEF2C-r4",
"38": "mr157-MV411-RNP_MEF2C-r5",
"39": "mr158-MV411-RNP_MEF2C-r6",
"40": "mr159-MV411-RNP_MEIS1-r4",
"41": "mr160-MV411-RNP_MEIS1-r5",
"42": "mr161-MV411-RNP_MEIS1-r6",
"43": "mr162-MV411-RNP_FLI1-r4",
"44": "mr163-MV411-RNP_FLI1-r5",
"45": "mr164-MV411-RNP_FLI1-r6",
"46": "mr165-MV411-RNP_ELF2-r4",
"47": "mr166-MV411-RNP_ELF2-r5",
"48": "mr167-MV411-RNP_ELF2-r6",
"49": "mr168-MV411-RNP_GFI1-r4",
"50": "mr169-MV411-RNP_GFI1-r5",
"51": "mr170-MV411-RNP_GFI1-r6",
"52": "mr171-MV411-RNP_IKZF1-r4",
"53": "mr172-MV411-RNP_IKZF1-r5",
"54": "mr173-MV411-RNP_IKZF1-r6",
"55": "mr174-MV411-RNP_CEBPA-r4",
"56": "mr175-MV411-RNP_CEBPA-r5",
"57": "mr176-MV411-RNP_CEBPA-r6",
"58": "mr177-MV411-RNP_MYB-r4",
"59": "mr178-MV411-RNP_MYB-r5",
"60": "mr179-MV411-RNP_MYB-r6",
"61": "mr180-MV411-RNP_MYBL2-r1",
"62": "mr181-MV411-RNP_MYBL2-r2",
"63": "mr182-MV411-RNP_MYBL2-r3",
"64": "mr183-MV411-RNP_HOXA9-r4",
"65": "mr184-MV411-RNP_HOXA9-r5",
"66": "mr185-MV411-RNP_HOXA9-r6",
"67": "mr186-MV411-RNP_AAVS1-r1",
"68": "mr187-MV411-RNP_AAVS1-r2",
"69": "mr188-MV411-RNP_AAVS1-r3",
"70": "mr189-MV411-RNP_SP1-r4",
"71": "mr190-MV411-RNP_SP1-r5",
"72": "mr191-MV411-RNP_SP1-r6",
"73": "mr192-MV411-RNP_SP1-r7"}
data.columns
data.columns = [rename[i] for i in data.columns]
data
filter some more
toremove = np.argwhere(data.values.var(1)==0)
toremove.ravel()
toremove.shape
data = data.drop(data.iloc[toremove.ravel()].index,0)
data.shape
ERCC = data[~data.index.str.contains('ENSG00')]
ensg = data[data.index.str.contains('ENSG00')]
data = data[~data.index.str.contains('ENSG00')]
renormalize the data
len(ERCC)
ERCC
ctf=pd.read_csv('../data/CTF.csv',header=None)[0].values.tolist()
ctf
%%R
library('erccdashboard')
ERCC = ERCC.astype(int)
ERCC['Feature'] = ERCC.index
sns.heatmap(np.log2(ERCC[ERCC.index.str.contains('ERCC-')][['mr186-MV411-RNP_AAVS1-r1', 'mr187-MV411-RNP_AAVS1-r2', 'mr188-MV411-RNP_AAVS1-r3','mr129-MV411-RNP_MYC-r4', 'mr189-MV411-RNP_SP1-r4', 'mr120-MV411-RNP_IRF2BP2-r4']].values / ERCC[ERCC.index.str.contains('ERCC-')][['mr186-MV411-RNP_AAVS1-r1', 'mr187-MV411-RNP_AAVS1-r2', 'mr188-MV411-RNP_AAVS1-r3','mr129-MV411-RNP_MYC-r4', 'mr189-MV411-RNP_SP1-r4', 'mr120-MV411-RNP_IRF2BP2-r4']].values.mean(0)+1))
experiments = list(set([i.split('-')[2] for i in ERCC.columns[:-1]]))
experiments.remove("RNP_AAVS1")
#TODO: compute the mass from concentration
###################################################
### code chunk number 3: defineInputData
###################################################
%R datType = "count" # "count" for RNA-Seq data, "array" for microarray data
%R isNorm = F # flag to indicate if input expression measures are already normalized, default is FALSE
%R filenameRoot = "RNPv2" # user defined filename prefix for results files
%R sample2Name = "AAAVS1" # name for sample 2 in the experiment
%R erccmix = "Single" # name of ERCC mixture design, "RatioPair" is default
%R erccdilution = 1/100 # dilution factor used for Ambion spike-in mixtures
%R spikeVol = 1 # volume (in microliters) of diluted spike-in mixture added to total RNA mass
%R choseFDR = 0.1 # user defined false discovery rate (FDR), default is 0.05
cols = list(ERCC.columns)
cols.sort()
res={}
for val in experiments:
d = {}
e=0
d.update({
'Feature':'Feature'
})
for i in cols[:-1]:
if val+'-' in i:
e+=1
d.update({i: val.split('_')[-1]+'_'+str(e)})
d.update({
'mr186-MV411-RNP_AAVS1-r1': 'AAAVS1_1',
'mr187-MV411-RNP_AAVS1-r2': 'AAAVS1_2',
'mr188-MV411-RNP_AAVS1-r3': 'AAAVS1_3'
})
a = ERCC[list(d.keys())].rename(columns=d)
a.to_csv('../data/ERCC_estimation.csv', index=None)
val = val.split('_')[-1]
torm = 'RNPv2.'+val+'.AAAVS1.All.Pvals.csv'
! rm $torm
%R -i val print(val)
%R print(sample2Name)
%R a <- read.csv('../data/ERCC_estimation.csv')
%R print(head(a))
%R exDat = ''
%R totalRNAmass <- 0.5
try:
%R -i val exDat = initDat(datType = datType, isNorm = isNorm, exTable = a, filenameRoot = filenameRoot, sample1Name = val, sample2Name = sample2Name, erccmix = erccmix, erccdilution = erccdilution, spikeVol = spikeVol, totalRNAmass = totalRNAmass, choseFDR = choseFDR)
%R exDat = est_r_m(exDat)
%R exDat = dynRangePlot(exDat)
except Warning:
print("failed for "+val)
continue
except:
print('worked for '+val)
%R print(summary(exDat))
%R grid.arrange(exDat$Figures$dynRangePlot)
%R grid.arrange(exDat$Figures$r_mPlot)
%R grid.arrange(exDat$Figures$rangeResidPlot)
%R -o rm rm <- exDat$Results$r_m.res$r_m.mn
%R -o se se <- exDat$Results$r_m.res$r_m.mnse
res[val] = (rm[0],se[0])
for i, v in res.items():
if abs(v[0]) > 3*v[1]:
print(i, v[0])
ERCC[ERCC.index.str.contains('ERCC-')][[i for i in ERCC.columns if 'AAVS1' in i]].mean()
ERCC[ERCC.index.str.contains('ERCC-')][[i for i in ERCC.columns if 'SPI1' in i]].mean()
scaling = res
scaling
h.dictToFileToFile(scaling,"../results/RNPv2/scaling.json")
scaling = h.fileToDict("../results/RNPv2/scaling.json")
%matplotlib inline
ig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(data.corr(),
xticklabels=data.columns,
yticklabels=data.columns, ax=ax)
model = AgglomerativeClustering(n_clusters=15,linkage="average",
affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(data.corr())
ii = itertools.count(data.shape[0])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
data.to_csv('../results/RNPv2/counts.csv')
data = pd.read_csv('../results/RNPv2/counts.csv',index_col=0)
%matplotlib inline
sns.clustermap(data.corr(), figsize=(20, 20))
plt.savefig('../results/RNPv2/cluster_corr_count.pdf')
data.sum().tolist()
data.shape
data
experiments = list(set([i.split('-')[2] for i in data.columns[:-1]]))
experiments
experiments.remove("RNP_AAVS1")
data['gene_id'] = data.index
data
housekeeping = ["C1orf43", "CHMP2A", "EMC7", "GPI", "PSMB2", "PSMB4", "RAB7A", "REEP5", "SNRPD3", "VCP", "VPS29"]
# from https://www.cell.com/trends/genetics/pdf/S0168-9525(13)00089-9.pdf
#results = {}
for val in experiments:
print(val)
design = pd.DataFrame(index=data.columns[:-1], columns=['DMSO','Target'],
data=np.array([[1 if 'RNP_AAVS1' in i else 0 for i in data.columns[:-1]],[1 if val+'-' in i else 0 for i in data.columns[:-1]]]).T)
design.index = design.index.astype(str).str.replace('-','.')
deseq = pyDESeq2.pyDESeq2(count_matrix=data, design_matrix = design,
design_formula='~DMSO + Target', gene_column="gene_id")
if abs(scaling[val.split('_')[1]][0]) > scaling[val.split('_')[1]][1]:
print("estimating sizeFactors for this one")
deseq.run_estimate_size_factors(controlGenes=data.gene_id.str.contains("ERCC-"))
elif val in results:
continue
deseq.run_deseq()
deseq.get_deseq_result()
r = deseq.deseq_result
r.pvalue = np.nan_to_num(np.array(r.pvalue), 1)
r.log2FoldChange = np.nan_to_num(np.array(r.log2FoldChange), 0)
results[val] = r
results
for val in experiments:
a = h.volcano(results[val],tohighlight=ctf,title=val, maxvalue= 60, searchbox=True, showlabels=True)
try:
show(a)
except RuntimeError:
show(a)
for k, val in results.items():
val.to_csv('../results/RNPv2/deseq_'+k+".csv")
results = {}
des = ! ls ../results/RNPv2/deseq_RNP_*.csv
for val in des:
results["RNP_"+val.split('RNP_')[1].split('.')[0]] = pd.read_csv(val,index_col=0)
results.keys()
results.pop('RNP_all')
tosave = pd.DataFrame(index=results['RNP_CEBPA'].index)
for k,v in results.items():
tosave[k+'_fc_log2'] = v.log2FoldChange
tosave[k+'_padj'] = v.padj
tosave[k+'_pval'] = v.pvalue
tosave.to_csv('../results/RNPv2/deseq_RNP_all.csv')
ctf.extend(['IRF2BP2','MYBL2','IKZF1'])
deseq = pd.DataFrame(index=ctf)
for k, val in results.items():
deseq[k] = [i.log2FoldChange if i.pvalue<0.05 else 0 for a, i in val.loc[ctf].iterrows()]
deseq
fig = sns.clustermap(figsize=(25,20), cmap=plt.cm.RdYlBu, data=deseq,vmin=-1,vmax=1,xticklabels=deseq.columns, yticklabels=deseq.index)
fig.savefig('../results/RNPv2/clustermap_ctf_deseq_all_scaled.pdf')
deseq.columns = [i.split('_')[1] for i in deseq.columns]
deseq = deseq.loc[deseq.columns]
deseq.to_csv('../results/RNPv2/deseq_CTFmat_allscaled.csv')
deseq = pd.read_csv('../results/RNPv2/deseq_CTFmat.csv',index_col=0)
net = nx.from_pandas_adjacency(((deseq < -0.3) | (deseq > 0.3)).T,create_using=nx.DiGraph)
pos = nx.nx_agraph.graphviz_layout(net, prog="neato")
colors = ['red' if deseq.loc[i[1],i[0]]> 0 else 'blue' for i in net.edges]
plt.figure(figsize=(8, 8))
nx.draw(net,pos,with_labels=True,edge_color=colors)
plt.show()
plt.figure(figsize=(8, 8))
nx.draw(net,pos,with_labels=True,edge_color=colors)
plt.show()
deseq[(deseq > -0.3) & (deseq < 0.3)]=0
net = nx.from_pandas_adjacency(deseq.T,create_using=nx.DiGraph)
pos = nx.nx_agraph.graphviz_layout(net, prog='dot')
colors = [-deseq.loc[i[1],i[0]] for i in net.edges]
colors = [i/-min(colors) if i <0 else i/max(colors) for i in colors]
plt.figure(figsize=(8, 8))
nx.draw(net,pos,with_labels=True, edge_color=colors,edge_cmap=plt.cm.RdYlBu)
plt.show()
plt.figure(figsize=(8, 8))
nx.draw(net,pos,with_labels=True, edge_color=colors,edge_cmap=plt.cm.RdYlBu)
plt.show()
col = {v:i for i, v in enumerate(set([i.split('-')[2] for i in data.columns[:-1]]))}
red = PCA(2).fit_transform(data[data.columns[:-1]].T)
h.scatter(red, labels=data.columns[:-1], radi=60000, colors=[col[i.split('-')[2]] for i in data.columns[:-1]])
red = PCA(30).fit_transform(data[data.columns[:-1]].T)
red = TSNE(2,4).fit_transform(red)
mr129-MYC-r4 seems weird
h.scatter(red, labels=data.columns[:-1], radi=70, colors=[col[i.split('-')[2]] for i in data.columns[:-1]])
pca = PCA(20)
red = pca.fit_transform(data[data.columns[:-1]].T)
pca.explained_variance_ratio_
data
res = {}
experiments
res
for val in experiments:
print(val)
totest = data[[v for v in data.columns[:-1] if val+'-' in v or 'AAVS1' in v]]
cls = ['Condition' if val+'-' in v else 'DMSO' for v in totest.columns]
if abs(scaling[val.split('_')[1]][0]) > 3*scaling[val.split('_')[1]][1]:
print("rescaling this one")
cols = [i for i in totest.columns if val+'-' in i]
totest[cols] = totest[cols]*(2**scaling[val.split('_')[1]][0])
else:
continue
res[val] = gseapy.gsea(data=totest, gene_sets='WikiPathways_2013',
cls= cls, no_plot=False, processes=8)
res[val].res2d['Term'] = [i for i in res[val].res2d.index]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
with open('../data/pathways/wikipathway_RNPv2', 'wb') as f:
pickle.dump(res,f)
with open('../data/pathways/wikipathway_RNPv2','rb') as f:
res = pickle.load(f)
import matplotlib.pyplot as plt
%matplotlib inline
for val in experiments:
res[val].res2d['Term'] = [i[3:].split('WP')[0] for i in res[val].res2d['Term']]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
plt.show()
a = set()
for k, val in res.items():
a.update(set(val.res2d.index))
a = {i:[0]*len(res) for i in a}
for n,(k, val) in enumerate(res.items()):
for i,v in val.res2d.iterrows():
a[i][n] = v.es
res = pd.DataFrame(a, index=res.keys())
res.columns = [i[3:].split('WP')[0] for i in res.columns]
res.index = [i.split('_')[1] for i in res.index]
fig = sns.clustermap(figsize=(25,20), data=res,vmin=-1,vmax=1,xticklabels=res.columns, yticklabels=res.index)
res.to_csv('../results/RNPv2/wikipathway_gsea.csv')
fig.savefig("../results/RNPv2/enriched_terms_scaled_gsea.pdf")
res = {}
for i, val in enumerate(['RNP_MYB']):
print(val)
totest = data[[v for v in data.columns[:-1] if val+'-' in v or 'AAVS1' in v]]
cls = ['Condition' if val+'-' in v else 'DMSO' for v in totest.columns]
if abs(scaling[val.split('_')[1]][0]) > 3*scaling[val.split('_')[1]][1]:
print("rescaling this one")
cols = [i for i in totest.columns if val+'-' in i]
totest[cols] = totest[cols]*(2**scaling[val.split('_')[1]][0])
elif val in res:
continue
res[val] = gseapy.gsea(data=totest, gene_sets='GO_Biological_Process_2015',
cls= cls, no_plot=False, processes=8)
res[val].res2d['Term'] = [i for i in res[val].res2d.index]
plt.figure(i)
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
for i, val in enumerate(experiments):
print(val)
totest = data[[v for v in data.columns[:-1] if val+'-' in v or 'AAVS1' in v]]
cls = ['Condition' if val+'-' in v else 'DMSO' for v in totest.columns]
if abs(scaling[val.split('_')[1]][0]) > 3*scaling[val.split('_')[1]][1]:
print("rescaling this one")
cols = [i for i in totest.columns if val+'-' in i]
totest[cols] = totest[cols]*(2**scaling[val.split('_')[1]][0])
elif val in res:
continue
res[val] = gseapy.gsea(data=totest, gene_sets='GO_Biological_Process_2015',
cls= cls, no_plot=False, processes=8)
res[val].res2d['Term'] = [i for i in res[val].res2d.index]
plt.figure(i)
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
with open('../data/pathways/GO_Biological_Process_2015_RNPv2', 'wb') as f:
pickle.dump(res,f)
with open('../data/pathways/GO_Biological_Process_2015_RNPv2','rb') as f:
res = pickle.load(f)
for i, v in res.items():
res[i].res2d['Term'] = [i.split('(GO')[0] for i in v.res2d['Term']]
creating matrices
a = set()
for k, val in res.items():
a.update(set(val.res2d.Term))
a = {i:[0]*len(res) for i in a}
for n,(k, val) in enumerate(res.items()):
for i,v in val.res2d.iterrows():
a[v.Term][n] = v.es
res = pd.DataFrame(a, index=res.keys())
fig = sns.clustermap(figsize=(25,20), data=res,vmin=-1,vmax=1, yticklabels=res.index ,cmap=plt.cm.RdYlBu)
fig.savefig("../results/RNPv2/enriched_terms_scaled_gsea.pdf")
model = DBSCAN(eps=0.5, min_samples=3, metric='euclidean')
labels = model.fit_predict(res)
sort = labels.argsort()
sns.clustermap(-res.T.corr(),cmap=plt.cm.RdYlBu,vmin=-1,vmax=1)
a = h.plotCorrelationMatrix(res.values[sort], res.index[sort].tolist(), interactive=True, title="RNP2_bioproc_corr")#,colors=[labels[i] for i in sort])
red = PCA(2).fit_transform(res)
h.scatter(red, labels=res.index, radi=1, colors=labels, showlabels=True)
red = TSNE(2,2).fit_transform(res)
h.scatter(red, labels=res.index, radi=9, colors=labels)
res.to_csv('../results/RNPv2/biopathway_gsea.csv')
res = pd.read_csv('../results/RNPv2/biopathway_gsea.csv',index_col=0)
data = pd.DataFrame(index=results['RNP_SP1'].index.tolist())
for i, v in results.items():
data[i]=v.log2FoldChange
model = AgglomerativeClustering(n_clusters=8,linkage="average",
affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(data.values.T)
ii = itertools.count(data.values.shape[1])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
model = DBSCAN(eps=0.5, min_samples=3, metric='euclidean')
labels = model.fit_predict(data.valuees)
sns.clustermap(data)
a = h.plotCorrelationMatrix(data.values.T[sort], data.columns[sort].tolist(), interactive=True, title="transcriptome correlation")#,colors=[labels[i] for i in sort])
## Filtered version (set to 0 genes with low p_value)
data = pd.DataFrame(index=results['RNP_SP1'].index.tolist())
for i, v in results.items():
v.loc[v[v.pvalue>0.01].index,"log2FoldChange"]==0
data[i]=v.log2FoldChange
a = h.plotCorrelationMatrix(data.values.T[sort], data.columns[sort].tolist(), interactive=True, title="transcriptome correlation",)
mostvar = data[(~data.index.str.contains('ERCC-')) & (data.var(1)>4)]
mostvar
sns.clustermap(-mostvar.corr(), cmap=plt.cm.RdYlBu, vmin=-1, vmax=1)
data.corr()